# standard imports I always use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pytz
from pytz import common_timezones, all_timezones
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
from datetime import datetime
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
cmshelp to fetch and clean the CMS synthetic files¶This notebook is just an example of how to use the CMS synthetic files with pandas. All of the columns in the different files that correspond to different ICD9 codes and HCPCS codes are retained, but additional columns are added with the code description.
First import the module and call this function to get the urls for the 20 different samples:
cmshelp.get_url_list_sample_i(i=1) takes a single argument corresponding to which of the 20 samples of the synthetic data
you want to fetch.
import cmshelp
url_list_1 = cmshelp.get_url_list_sample_i(1)
url_list_2 = cmshelp.get_url_list_sample_i(2)
url_list_1
The 3 beneficiary file urls in each of the 20 samples are at indices 0, 6, and 7.
To make the corresonding dataframes, call the function
cmshelp.make_clean_bene(url) which takes a url as an argument.
We will change this appropriately when working with files.
The date type columns BENE_BIRTH_DT and BENE_DEATH_DT have been turned into pandas datetime objects.
And the SP_STATE_CODE information has been used to create three columns elevatoin, lat, lng, as well as the text columns state and county.
dfbene_sample1_0 = cmshelp.make_clean_bene(url_list_1[0])
dfbene_sample1_0.columns
dfbene_sample1_0.plot.scatter('lng','lat') # no Alaska??
You need plotly installed.
You can install it within a notebook by running
!pip install plotly
within a cell.
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
import plotly
plotly.offline.init_notebook_mode()
data = []
cluster = []
#colors = ['rgb(228,26,28)','rgb(55,126,184)','rgb(77,175,74)']
for i in range(len(dfbene_sample1_0['State'].unique())):
name = dfbene_sample1_0['State'].unique()[i]
#color = colors[i]
x = dfbene_sample1_0[ dfbene_sample1_0['State'] == name ]['lng']
y = dfbene_sample1_0[ dfbene_sample1_0['State'] == name ]['lat']
z = dfbene_sample1_0[ dfbene_sample1_0['State'] == name ]['elevation']
trace = dict(
name = name,
x = x, y = y, z = z,
type = "scatter3d",
mode = 'markers',
marker = dict( size=3, line=dict(width=0) ) )
data.append( trace )
layout = dict(
width=800,
height=550,
autosize=False,
title='Beneficiary file',
scene=dict(
xaxis=dict(
gridcolor='rgb(255, 255, 255)',
zerolinecolor='rgb(255, 255, 255)',
showbackground=True,
backgroundcolor='rgb(230, 230,230)'
),
yaxis=dict(
gridcolor='rgb(255, 255, 255)',
zerolinecolor='rgb(255, 255, 255)',
showbackground=True,
backgroundcolor='rgb(230, 230,230)'
),
zaxis=dict(
gridcolor='rgb(255, 255, 255)',
zerolinecolor='rgb(255, 255, 255)',
showbackground=True,
backgroundcolor='rgb(230, 230,230)'
),
aspectratio = dict( x=1, y=1, z=0.7 ),
aspectmode = 'manual'
),
)
fig = dict(data=data, layout=layout)
#py.iplot(fig, filename='pandas-3d-benefile', validate=False)
plotly.offline.iplot(fig, validate=False)
Plotting the information in the datetype columns:
dfbene_sample1_0['BENE_BIRTH_DT'].dt.year.value_counts().sort_index().plot(kind='bar',figsize=(30,10))
del dfbene_sample1_0
Carrier claims files are in indeces 1 and 2 of the url lists.
You can make the corresponding dataframe by calling
cmshelp.make_clean_carrier(url)
dfcarrier_sample1_1 = cmshelp.make_clean_carrier(url_list_1[1])
list(dfcarrier_sample1_1.columns)
dfcarrier_sample1_1[['ICD9_DGNS_CD_2', 'ICD9_DGNS_CD_2_description']].head()
dfcarrier_sample1_1[['HCPCS_CD_1','HCPCS_CD_1_description']].head()
dfcarrier_sample1_1[['LINE_PRCSG_IND_CD_1']].head()
dfcarrier_sample1_1[['LINE_ICD9_DGNS_CD_1','LINE_ICD9_DGNS_CD_1_description']].head()
del dfcarrier_sample1_1
dfinpatient_sample1 = cmshelp.make_clean_inpatient(url_list_1[3])
list(dfinpatient_sample1.columns)
dfinpatient_sample1[['ADMTNG_ICD9_DGNS_CD', 'ADMTNG_ICD9_DGNS_CD_description']].head()
dfinpatient_sample1[['ICD9_DGNS_CD_1','ICD9_DGNS_CD_1_description']].head()
dfinpatient_sample1[['ICD9_PRCDR_CD_1','ICD9_PRCDR_CD_1_description']].head()
dfinpatient_sample1[['CLM_DRG_CD','CLM_DRG_CD_description']].head()
dfoutpatient = cmshelp.make_clean_outpatient(url_list_1[4])
list(dfoutpatient.columns)
dfdrugs = cmshelp.make_clean_drugs(url_list_1[5])
list(dfdrugs.columns)